format: html: embed-resources: true


In [ ]:
API_KEY='396dd8714fbc4b4fa24b537d26e3879e'
In [ ]:
import requests
import json
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
In [ ]:
baseURL = "https://newsapi.org/v2/everything?"
total_requests=2
verbose=True
TOPIC='Disney'
In [ ]:
def FormURL(TOPIC):
    URLpost = {'apiKey': API_KEY,
               'q': '+'+TOPIC,
               'sortBy': 'relevancy',
               'totalRequests': 1}

# print(baseURL)
# print(URLpost)

#GET DATA FROM API
    response = requests.get(baseURL, URLpost) #request data from the server
# print(response.url);  
    response = response.json() #extract txt data from request into json
    return(response)
# PRETTY PRINT
# https://www.digitalocean.com/community/tutorials/python-pretty-print-json

# print(json.dumps(response, indent=2))

# #GET TIMESTAMP FOR PULL REQUEST
    
    # timestamp = datetime.now().strftime("%Y-%m-%d-H%H-M%M-S%S")

# SAVE TO FILE 
# with open(timestamp+'-newapi-raw-data.json', 'w') as outfile:
    # json.dump(response, outfile, indent=4)
In [ ]:
def string_cleaner(input_string):
    try: 
        out=re.sub(r"""
                    [,.;@#?!&$-]+  # Accept one or more copies of punctuation
                    \ *           # plus zero or more copies of a space,
                    """,
                    " ",          # and replace it with a single space
                    input_string, flags=re.VERBOSE)

        #REPLACE SELECT CHARACTERS WITH NOTHING
        out = re.sub('[’.]+', '', input_string)

        #ELIMINATE DUPLICATE WHITESPACES USING WILDCARDS
        out = re.sub(r'\s+', ' ', out)

        #CONVERT TO LOWER CASE
        out=out.lower()
    except:
        print("ERROR")
        out=''
    return out
In [ ]:
def CleanJSON(response, TOPIC):    
    article_list=response['articles']   #list of dictionaries for each article
    article_keys=article_list[0].keys()
#print("AVAILABLE KEYS:")
#print(article_keys)
    #index=0
    cleaned_data=[];  
    text_description = [];
    for article in article_list:
        tmp=[]

        for key in article_keys:

        # if(key=='source'):
        #     src=string_cleaner(article[key]['name'])
        #     tmp.append(src) 

        # if(key=='author'):
        #     author=string_cleaner(article[key])
            #ERROR CHECK (SOMETIMES AUTHOR IS SAME AS PUBLICATION)
        #     if(src in author): 
        #         print(" AUTHOR ERROR:",author);author='NA'
        #     tmp.append(author)

            if(key=='title'):
                text_description.append(string_cleaner(article[key]))

            if(key=='description'):
                text_description.append(string_cleaner(article[key]))

            if(key=='content'):
                tmp.append(string_cleaner(article[key]))
            
        # if(key=='publishedAt'):
            #DEFINE DATA PATERN FOR RE TO CHECK  .* --> wildcard
        #     ref = re.compile('.*-.*-.*T.*:.*:.*Z')
        #     date=article[key]
        #     if(not ref.match(date)):
        #         print(" DATE ERROR:",date); date="NA"
        #     tmp.append(date)
        
        # index += 1
        cleaned_data.append(tmp)
        df = pd.DataFrame(cleaned_data)
        #print(df)
        #print(cleaned_data)
        #print(text_description)
        df.to_csv(TOPIC+'_cleaned_news.csv', index=False)
        return text_description, cleaned_data
In [ ]:
import wikipedia
with open(TOPIC+"_cleaned_wiki.txt", "w") as file:
    file.write(wikipedia.summary(TOPIC))
In [ ]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
#import matplotlib
# MODIFIED FROM 
# https://towardsdatascience.com/simple-wordcloud-in-python-2ae54a9f58e5
def generate_word_cloud(my_text):
    # exit()
    # Import package
    # Define a function to plot word cloud
    #plt.rcParams['font.family'] = 'DejaVu Sans'
    def plot_cloud(wordcloud):
        # Set figure size
        plt.figure(figsize=(40, 30))
        # Display image
        plt.imshow(wordcloud) 
        # No axis details
        plt.axis("off");

    # Generate word cloud
    wordcloud = WordCloud(
        width = 3000,
        height = 2000, 
        random_state=1, 
        background_color='salmon', 
        colormap='Pastel1', 
        collocations=False,
        stopwords = STOPWORDS).generate(my_text)
    plot_cloud(wordcloud)
    plt.show()

#generate_word_cloud(text)
generate_word_cloud(str(CleanJSON(FormURL(TOPIC), TOPIC)[0]))
generate_word_cloud(wikipedia.summary(TOPIC))
im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]
[['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]'], ['im a disney adult, and i dont care who knows it i live in disney t-shirts, i was nearly a friend of tinker bell (thats a euphemism for playing a character in the parks), i have been to disney world … [+10007 chars]']]
In [ ]:
import pandas as pd
import numpy as np
In [ ]:
# read data
df = pd.read_csv("../data/raw-data/news_r.csv")
In [ ]:
## REMOVE SPACES FROM COLUMN NAMES
df.rename(columns=lambda x: x.strip(), inplace=True)
df.rename(columns={"publishedAt": 'date'}, inplace=True)
df = df.drop(columns=["Unnamed: 0"])
In [ ]:
#CONVERT TYPECAST
df["date"] = pd.to_datetime(df["date"])
In [ ]:
df = df.drop_duplicates()
In [ ]:
df['content'] = df['content'].apply(lambda x : np.nan if "http" in x else x)
df['description'] = df['description'].apply(lambda x : np.nan if "http" in x else x)
df = df.replace('[Removed]', np.nan)
In [ ]:
df = df.dropna(axis = 0)
In [ ]:
for char in ['[', ']', '&', '$', '<ul>', '<li>', '</li>', '+', 'chars', 'amp', '#', '…', '®', '{', '}', '...', '%']:
    df['content'] = df['content'].apply(lambda x : x.replace(char, ''))
    df['description'] = df['description'].apply(lambda x : x.replace(char, ''))
    df['title'] = df['title'].apply(lambda x : x.replace(char, ''))
In [ ]:
import re
df['content'] = df['content'].apply(lambda x: re.sub(r'[0-9]', '', x))
df['description'] = df['description'].apply(lambda x: re.sub(r'[0-9]', '', x))
df['title'] = df['title'].apply(lambda x: re.sub(r'[0-9]', '', x))
In [ ]:
df.to_csv("../data/modified-data/clean_text.csv")
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = df['title'].tolist()
corpus.extend(df['description'].tolist())
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
print("Vocabulary: ", vectorizer.vocabulary_)
import csv
with open('../data/clean-data/vocabulary.csv', 'w') as csv_file:  
    writer = csv.writer(csv_file)
    for key, value in vectorizer.vocabulary_.items():
       writer.writerow([key, value])
vector = vectorizer.transform(corpus)
# Summarizing the Encoded Texts
print("Encoded Document is:\n", vector.toarray())
pd.DataFrame(vector.toarray()).to_csv("../data/modified-data/encode.csv")
Vocabulary:  {'sunny': 1112, 'or': 796, 'akshay': 37, 'who': 1298, 'will': 1303, 'you': 1335, 'watch': 1275, 'loki': 656, 'season': 997, 'temporal': 1143, 'loom': 663, 'explained': 387, 'what': 1290, 'is': 586, 'the': 1151, 'tva': 1212, 'machine': 670, 'that': 1150, 'created': 262, 'mcu': 695, 'sacred': 979, 'timeline': 1177, 'stack': 1071, 'spotify': 1070, 'to': 1182, 'launch': 623, 'video': 1247, 'ads': 23, 'on': 788, 'roku': 967, 'meta': 708, 'considers': 243, 'ad': 17, 'free': 448, 'tier': 1171, 'in': 563, 'eu': 370, 'my': 743, 'disney': 315, 'top': 1191, 'mickey': 711, 'not': 770, 'so': 1046, 'scary': 987, 'halloween': 504, 'party': 826, 'today': 1183, 'wordle': 1314, 'hints': 530, 'clues': 220, 'and': 58, 'answer': 63, 'for': 441, 'friday': 450, 'october': 776, 'th': 1147, 'it': 589, 'pivotal': 842, 'year': 1330, 'labor': 615, 'strikes': 1093, 'charts': 207, 'tell': 1141, 'story': 1084, 'americans': 52, 'are': 73, 'unleashing': 1222, 'monsters': 727, 'they': 1157, 'have': 516, 'no': 767, 'idea': 554, 'how': 548, 'contain': 245, 'make': 677, 'your': 1337, 'home': 540, 'merry': 707, 'with': 1308, 'holiday': 537, 'decor': 290, 'netflix': 751, 'india': 570, 'subscribers': 1099, 'dwarfed': 335, 'by': 171, 'prime': 874, 'bernstein': 123, 'says': 985, 'mission': 721, 'raniganj': 903, 'box': 143, 'office': 782, 'collection': 225, 'can': 178, 'kumar': 613, 'deliver': 294, 'back': 95, 'superhits': 1115, 'day': 278, 'doesn': 320, 'look': 660, 'promising': 884, 'shares': 1023, 'rebound': 917, 'amid': 53, 'broader': 157, 'market': 684, 'downturn': 328, 'lego': 636, 'pixar': 843, 'up': 1230, 'house': 547, 'building': 163, 'set': 1016, 'only': 791, 'why': 1300, 'wasn': 1274, 'ob': 775, 'memory': 703, 'wiped': 1307, 'he': 518, 'remains': 930, 'epic': 362, 'games': 463, 'fortnite': 444, 'longer': 659, 'bringing': 153, 'bacon': 97, 'addition': 18, 'layoffs': 628, 'we': 1282, 're': 912, 'going': 483, 'charge': 204, 'tv': 1211, 'film': 421, 'producers': 878, 'like': 643, 'seat': 999, 'unreal': 1224, 'engine': 358, 'users': 1235, 'followup': 438, 'world': 1319, 'cup': 271, 'shubman': 1034, 'gill': 474, 'reportedly': 940, 'down': 327, 'dengue': 296, 'might': 714, 'miss': 720, 'opener': 793, 'match': 689, 'against': 29, 'australia': 85, 'where': 1293, 'find': 427, 'birmingham': 132, 'city': 214, 'vs': 1261, 'west': 1289, 'brom': 159, 'us': 1234, 'nct': 747, 'became': 110, 'pop': 856, 'unapologetic': 1218, 'mavericks': 691, 'marvel': 687, 'sets': 1017, 'tom': 1186, 'hiddleston': 525, 'replacement': 938, 'post': 860, 'credits': 264, 'scene': 988, 'legal': 634, 'drama': 329, 'suits': 1109, 'which': 1295, 'streams': 1091, 'peacock': 830, 'hits': 534, 'weeks': 1285, 'at': 78, 'nielsen': 761, 'streaming': 1090, 'rankings': 906, 'setting': 1018, 'record': 922, 'most': 732, 'rick': 958, 'porter': 859, 'hollywood': 539, 'reporter': 941, 'all': 41, 'ahsoka': 34, 'episodes': 364, 'ranked': 904, 'from': 452, 'worst': 1322, 'wizard': 1310, 'ein': 344, 'li': 640, 'eretz': 367, 'acheret': 7, 'other': 799, 'land': 617, 'shell': 1028, 'malaysia': 679, 'launches': 625, 'star': 1073, 'wars': 1272, 'racers': 898, 'remote': 932, 'control': 250, 'cars': 186, 'designs': 298, 'rm': 965, 'eastpak': 340, 'six': 1039, 'years': 1332, 'after': 27, 'metoo': 710, 'entertainment': 359, 'employees': 352, 'believe': 119, 'culture': 270, 'of': 779, 'abuse': 2, 'misconduct': 719, 'has': 512, 'improved': 562, 'survey': 1125, 'finds': 428, 'episode': 363, 'ending': 355, 'premiere': 865, 'mobius': 723, 'mostly': 733, 'excellent': 376, 'adventure': 24, 'end': 354, 'time': 1175, 'slipping': 1042, 'god': 481, 'mischief': 718, 'keeps': 604, 'glitching': 477, 'teases': 1138, 'kang': 603, 'secret': 1001, 'connection': 241, 'spoiler': 1064, 'ep': 360, 'about': 1, 'straddling': 1085, 'line': 644, 'between': 126, 'intrigue': 582, 'confusion': 240, 'stay': 1077, 'tuned': 1207, 'because': 111, 'biggest': 130, 'questions': 895, 'review': 956, 'slip': 1041, 'slide': 1040, 'must': 742, 'see': 1004, 'ties': 1172, 'thor': 1162, 'comic': 228, 'books': 140, 'spoilers': 1065, 'here': 523, 'resolves': 947, 'huge': 549, 'cliffhanger': 219, 'round': 973, 'goes': 482, 'behind': 117, 'attraction': 81, 'again': 28, 'does': 319, 'character': 202, 'gives': 476, 'emergency': 350, 'signal': 1036, 'as': 77, 'brawl': 146, 'breaks': 151, 'out': 803, 'among': 54, 'guests': 499, 'once': 789, 'upon': 1233, 'studio': 1096, 'stills': 1082, 'feature': 404, 'classic': 217, 'characters': 203, 'live': 651, 'action': 9, 'surroundings': 1124, 'disneyland': 316, 'banned': 100, 'viral': 1256, 'tiktok': 1173, 'trend': 1203, 'still': 1081, 'features': 406, 'jonathan': 597, 'majors': 676, 'conqueror': 242, 'victor': 1246, 'timely': 1178, 'elderly': 345, 'man': 680, 'found': 445, 'collapsed': 224, 'bushes': 166, 'lebron': 631, 'james': 592, 'lakers': 616, 'preseason': 868, 'but': 168, 'healthy': 520, 'murders': 741, 'finale': 425, 'was': 1273, 'big': 129, 'hulu': 551, 'sony': 1054, 'pictures': 838, 'core': 252, 'ps': 891, 'adds': 20, 'new': 755, 'plus': 851, 'benefits': 122, 'early': 339, 'movie': 734, 'access': 4, 'another': 62, 'becoming': 112, 'streamed': 1088, 'title': 1180, 'times': 1179, 'mary': 688, 'poppins': 857, 'legend': 635, 'celebrates': 191, 'birthday': 133, 'crowds': 268, 'suffer': 1105, 'dangers': 275, 'florida': 433, 'virgin': 1257, 'river': 964, 'final': 424, 'batch': 102, 'promos': 886, 'minutes': 717, 'greatest': 493, 'vinicius': 1254, 'front': 453, 'page': 817, 'montage': 728, 'pinochius': 840, 'shocks': 1029, 'spanish': 1061, 'public': 893, 'real': 915, 'madrid': 672, 'take': 1130, 'pet': 835, 'sematary': 1009, 'bloodlines': 136, 'dick': 304, 'butkus': 169, 'nfl': 759, 'prolific': 883, 'dead': 280, 'fearsome': 403, 'hall': 503, 'fame': 396, 'chicago': 209, 'bears': 109, 'linebacker': 646, 'dies': 308, 'long': 658, 'awaited': 91, 'open': 792, 'ahead': 33, 'schedule': 990, 'apple': 72, 'ghosts': 471, 'thriller': 1166, 'cocaine': 222, 'bear': 108, 'darkness': 276, 'within': 1309, 'la': 614, 'luz': 668, 'del': 292, 'mundo': 740, 'when': 1292, 'golden': 484, 'bachelor': 94, 'burning': 165, 'abc': 0, 'boss': 142, 'casting': 189, 'hometowns': 541, 'fantasy': 401, 'suites': 1108, 'america': 51, 'be': 105, 'talking': 1133, 'taylor': 1135, 'swift': 1126, 'donates': 325, 'vip': 1255, 'eras': 366, 'tour': 1195, 'tickets': 1170, 'selena': 1008, 'gomez': 485, 'charity': 205, 'fund': 458, 'work': 1316, 'this': 1161, 'london': 657, 'event': 372, 'elemental': 346, 'more': 731, 'popular': 858, 'than': 1148, 'little': 650, 'mermaid': 706, 'guardians': 497, 'galaxy': 462, 'overall': 807, 'claims': 216, 'spot': 1069, 'week': 1283, 'tops': 1192, 'originals': 798, 'franchise': 447, 'coming': 230, 'linear': 645, 'networks': 753, 'movies': 735, 'soon': 1055, 'channels': 201, 'brooke': 160, 'an': 56, 'update': 1232, 'walt': 1265, 'tradition': 1198, 'animal': 59, 'crossing': 266, 'comes': 227, 'price': 873, 'hikes': 528, 'news': 757, 'odds': 777, 'ends': 357, 'park': 822, 'sued': 1104, 'woman': 1311, 'gynecologic': 501, 'injuries': 572, 'community': 233, 'rallies': 902, 'around': 74, 'contentious': 248, 'lawsuit': 627, 'schrodinger': 992, 'snow': 1045, 'white': 1297, 'actress': 14, 'rachel': 899, 'ziegler': 1339, 'creates': 263, 'controversy': 251, 'interview': 579, 'comments': 231, 'studios': 1097, 'scrapping': 993, 'everything': 375, 'do': 317, 'blade': 135, 'save': 984, 'universe': 1221, 'least': 630, 'his': 531, 'own': 811, 'show': 1032, 'turner': 1210, 'nets': 752, 'now': 773, 'share': 1021, 'rights': 962, 'air': 35, 'films': 422, 'brings': 154, 'stars': 1075, 'festival': 413, 'held': 521, 'captive': 181, 'unsafe': 1225, 'conditions': 237, 'spooktacular': 1067, 'love': 665, 'letter': 639, 'horror': 543, 'icon': 552, 'ranking': 905, 'wheel': 1291, 'lioness': 647, 'rise': 963, 'hotel': 546, 'guest': 498, 'suffers': 1106, 'attack': 79, 'huis': 550, 'ten': 1144, 'bosch': 141, 'wants': 1267, 'decision': 289, 'casino': 187, 'project': 881, 'breaking': 150, 'moana': 722, 'epcot': 361, 'zegler': 1338, 'head': 519, 'shared': 1022, 'under': 1219, 'deal': 282, 'ed': 341, 'sheeran': 1027, 'explains': 388, 'had': 502, 'grave': 491, 'dug': 334, 'property': 888, 'frozen': 454, 'stems': 1079, 'incredible': 567, 'director': 312, 'hilton': 529, 'families': 398, 'one': 790, 'ott': 800, 'sukanya': 1110, 'verma': 1241, 'her': 522, 'recommendations': 921, 'introduced': 583, 'incredibly': 568, 'important': 560, 'piece': 839, 'technology': 1139, 'prepares': 867, 'through': 1167, 'contemplates': 247, 'offering': 781, 'social': 1047, 'platforms': 847, 'bring': 152, 'app': 68, 'announced': 60, 'roll': 969, 'chris': 213, 'highlights': 527, 'spookiest': 1066, 'moments': 725, 'magic': 673, 'kingdom': 609, 'yearly': 1331, 'celebration': 193, 'solve': 1050, 'daily': 273, 'also': 47, 'play': 848, 'competitive': 235, 'learn': 629, 'each': 337, 'word': 1313, 'makes': 678, 'particularly': 825, 'different': 309, 'walkouts': 1264, 'private': 875, 'sector': 1002, 'workers': 1317, 'present': 869, 'difficult': 310, 'politics': 853, 'get': 469, 'sorted': 1057, 'way': 1280, 'avoids': 90, 'calamity': 174, 'biden': 128, 'administration': 21, 'proxy': 890, 'war': 1268, 'russia': 978, 'ukraine': 1217, 'appears': 71, 'been': 113, 'lost': 664, 'battlefield': 104, 'midst': 713, 'meth': 709, 'cough': 255, 'syrup': 1127, 'pagea': 818, 'shopdisney': 1030, 'dropped': 332, 'ton': 1187, 'merchandise': 705, 'couldn': 257, 'excited': 377, 'eyeing': 391, 'available': 89, 'wait': 1262, 'decorate': 291, 'let': 638, 'options': 795, 'think': 1159, 'wreaths': 1325, 'succeeded': 1102, 'scaling': 986, 'its': 591, 'business': 167, 'despite': 299, 'global': 478, 'giant': 472, 'consistently': 244, 'lowering': 666, 'subscription': 1100, 'costs': 253, 'country': 259, 'analysts': 57, 'alliancebernstein': 44, 'wrote': 1328, 'report': 939, 'clients': 218, 'thursday': 1168, 'streamer': 1089, 'predicted': 862, 'poor': 855, 'opening': 794, 'performance': 833, 'grab': 488, 'cute': 272, 'great': 492, 'over': 806, 'amazon': 50, 'additional': 19, 'details': 301, 'include': 565, 'give': 475, 'fan': 400, 'any': 64, 'kid': 607, 'lives': 652, 'high': 526, 'flying': 434, 'adventures': 25, 'gift': 473, 'full': 456, 'confirmed': 239, 'agents': 31, 'variance': 1238, 'authority': 87, 'got': 486, 'their': 1152, 'memories': 702, 'ouroboros': 802, 'played': 849, 'some': 1051, 'spectacular': 1062, 'innings': 573, 'odi': 778, 'cricket': 265, 'recent': 919, 'fever': 415, 'tested': 1146, 'before': 114, 'call': 175, 'taken': 1131, 'matter': 690, 'television': 1140, 'via': 1245, 'efl': 343, 'chionship': 212, 'pm': 852, 'et': 369, 'pt': 892, 'espn': 368, 'stream': 1087, 'nine': 765, 'member': 700, 'group': 496, 'always': 48, 'genre': 466, 'innovative': 574, 'bands': 98, 'album': 38, 'fact': 393, 'check': 208, 'doubles': 326, 'gloss': 479, 'packed': 815, 'sound': 1058, 'warning': 1271, 'smash': 1044, 'hit': 533, 'series': 1012, 'officially': 784, 'better': 125, 'ever': 373, 'looks': 662, 'titular': 1181, 'isn': 588, 'burdened': 164, 'debut': 286, 'mer': 704, 'dave': 277, 'filoni': 423, 'come': 226, 'reflect': 924, 'could': 256, 'very': 1244, 'well': 1288, 'best': 124, 'projects': 882, 'since': 1037, 'george': 468, 'lucas': 667, 'era': 365, 'sermon': 1013, 'gave': 464, 'kol': 611, 'nidrei': 760, 'sometimes': 1052, 'song': 1053, 'takes': 1132, 'residence': 945, 'brain': 144, 'happens': 508, 'ear': 338, 'worm': 1320, 'me': 696, 'car': 183, 'bit': 134, 'unusual': 1227, 'looking': 661, 'rc': 911, 'sale': 981, 'till': 1174, 'november': 772, 'made': 671, 'racer': 897, 'first': 431, 'iconic': 553, 'adorning': 22, 'emblematic': 349, 'models': 724, 'appeared': 70, 'fucking': 455, 'young': 1336, 'hashtag': 513, 'took': 1190, 'off': 780, 'bombshell': 139, 'exposés': 389, 'harvey': 511, 'weinstein': 1286, 'number': 774, 'industry': 571, 'feel': 408, 'progress': 880, 'wif': 1302, 'nonprofit': 769, 'organizatio': 797, 'ground': 495, 'running': 976, 'culminates': 269, 'twist': 1215, 'filled': 420, 'tone': 1188, 'second': 1000, 'outing': 805, 'stinger': 1083, 'means': 697, 'listen': 649, 'laufeyson': 622, 'unstuck': 1226, 'shows': 1033, 'finally': 426, 'returned': 952, 'importantly': 561, 'did': 305, 'chicken': 210, 'mcnuggets': 694, 'really': 916, 'exist': 382, 'unlikely': 1223, 'hero': 524, 'having': 517, 'rough': 972, 'few': 416, 'stage': 1072, 'exciting': 378, 'drops': 333, 'history': 532, 'if': 555, 'two': 1216, 'felt': 412, 'dream': 330, 'don': 323, 'worry': 1321, 'supposed': 1117, 'happening': 507, 'according': 6, 'executive': 381, 'producer': 877, 'kevin': 605, 'wright': 1326, 'crafting': 261, 'returns': 953, 'owen': 809, 'wilson': 1304, 'sophia': 1056, 'di': 303, 'martino': 686, 'surprise': 1121, 'broxton': 162, 'oklahoma': 785, 'comics': 229, 'raises': 901, 'interesting': 577, 'break': 147, 'know': 610, 'ravonna': 910, 'explain': 386, 'too': 1189, 'much': 736, 'audience': 84, 'feels': 409, 'carry': 185, 'tells': 1142, 'thewrap': 1156, 'contains': 246, 'try': 1206, 'ways': 1281, 'solo': 1049, 'month': 729, 'far': 402, 'premie': 864, 'yes': 1333, 'indeed': 569, 'reunites': 954, 'familiar': 397, 'face': 392, 'past': 827, 'jump': 600, 'resolve': 946, 'breakdown': 148, 'future': 460, 'robert': 966, 'niles': 764, 'ready': 914, 'scenes': 989, 'theme': 1154, 'attractions': 82, 'drop': 331, 'featured': 405, 'jung': 601, 'follow': 436, 'footsteps': 440, 'many': 682, 'beware': 127, 'performer': 834, 'alerted': 40, 'management': 681, 'fight': 418, 'broke': 158, 'parade': 820, 'viewing': 1251, 'bystander': 172, 'captured': 182, 'incident': 564, 'place': 844, 'paris': 821, 'resort': 948, 'para': 819, 'released': 927, 'handful': 505, 'featuring': 407, 'whole': 1299, 'host': 544, 'sharing': 1024, 'screen': 994, 'them': 1153, 'there': 1155, 'anything': 65, 'want': 1266, 'parks': 823, 'vacation': 1236, 'wrong': 1327, 'side': 1035, 'security': 1003, 'unfortunately': 1220, 'seems': 1005, 'though': 1164, 'several': 1019, 'people': 831, 'decided': 288, 'hard': 509, 'strict': 1092, 'meas': 698, 'said': 980, 'hasty': 515, 'changes': 200, 'production': 879, 'timeframe': 1176, 'fell': 411, 'ill': 556, 'riding': 960, 'twilight': 1214, 'zone': 1340, 'tower': 1196, 'terror': 1145, 'imagineers': 558, 'versions': 1243, 'worl': 1318, 'plan': 845, 'three': 1165, 'fall': 395, 'third': 1160, 'watched': 1276, 'based': 101, 'views': 1252, 'didn': 306, 'provide': 889, 'further': 459, 'confirm': 238, 'rema': 928, 'focused': 435, 'called': 176, 'launched': 624, 'playstation': 850, 'markets': 685, 'including': 566, 'canada': 179, 'rebrand': 918, 'previous': 872, 'bravia': 145, 'tvs': 1213, 'phones': 836, 'allow': 45, 'buy': 170, 'rent': 936, 'holder': 535, 'ozark': 814, 'list': 648, 'four': 446, 'seasons': 998, 'beloved': 120, 'actor': 13, 'starred': 1074, 'celebrating': 192, 'cementing': 195, 'oldest': 787, 'living': 653, 'age': 30, 'remain': 929, 'empty': 353, 'multiple': 737, 'experts': 385, 'warn': 1269, 'visiting': 1259, 'dangerous': 274, 'summer': 1111, 'unusually': 1228, 'tranquil': 1200, 'traditionally': 1199, 'landed': 618, 'september': 1011, 'result': 951, 'surpassed': 1119, 'finishes': 430, 'changed': 199, 'our': 801, 'goal': 480, 'preserving': 870, 'integrity': 576, 'hasn': 514, 'course': 260, 'accidents': 5, 'happen': 506, 'arrived': 76, 'promo': 885, 'videos': 1248, 'release': 926, 'seaso': 996, 'valencia': 1237, 'sports': 1068, 'newspaper': 758, 'superdeporte': 1113, 'left': 633, 'speechless': 1063, 'harsh': 510, 'depiction': 297, 'jr': 599, 'apos': 67, 'pinochiusapos': 841, 'edited': 342, 'screenwriter': 995, 'turned': 1209, 'boasts': 138, 'lengthy': 637, 'ip': 585, 'involvement': 584, 'trek': 1202, 'afforded': 26, 'row': 974, 'tarantino': 1134, 'expounding': 390, 'straight': 1086, 'rated': 908, 'former': 443, 'countless': 258, 'commercials': 232, 'died': 307, 'official': 783, 'cause': 190, 'death': 284, 'family': 399, 'tribune': 1204, 'peacefully': 828, 'ap': 66, 'middle': 712, 'team': 1136, 'journey': 598, 'water': 1278, 'inspired': 575, 'following': 437, 'steps': 1080, 'dwayne': 336, 'johnson': 595, 'pulling': 894, 'planning': 846, 'night': 762, 'couch': 254, 'picks': 837, 'newest': 756, 'multiversal': 738, 'last': 620, 'premiered': 866, 'launching': 626, 'old': 786, 'everyone': 374, 'strong': 1094, 'ratings': 909, 'million': 716, 'viewers': 1250, 'watching': 1277, 'delayed': 293, 'delivering': 295, 'strongest': 1095, 'john': 594, 'shearer': 1026, 'getty': 470, 'images': 557, 'needed': 748, 'large': 619, 'ticket': 1169, 'item': 590, 'rare': 907, 'impact': 559, 'benefit': 121, 'auction': 83, 'she': 1025, 'friend': 451, 'responded': 950, 'grammy': 489, 'winner': 1306, 'donated': 324, 'conc': 236, 'sees': 1007, 'hosting': 545, 'activation': 11, 'oxo': 813, 'south': 1059, 'bank': 99, 'would': 1323, 'rude': 975, 'attend': 80, 'teamed': 1137, 'auth': 86, 'slow': 1043, 'beginnings': 116, 'surprised': 1122, 'naysayers': 746, 'vol': 1260, 'version': 1242, 'related': 925, 'replace': 937, 'toy': 1197, 'breaker': 149, 'surpassing': 1120, 'chart': 206, 'began': 115, 'reporting': 942, 'part': 824, 'renegotiated': 934, 'multiyear': 739, 'co': 221, 'exclusively': 380, 'hope': 542, 'just': 602, 'suite': 1107, 'broadcast': 155, 'cable': 173, 'those': 1163, 'owned': 812, 'broadcasting': 156, 'read': 913, 'academy': 3, 'award': 92, 'join': 596, 'narrators': 745, 'candlelight': 180, 'processional': 876, 'revealed': 955, 'roster': 971, 'celebrity': 194, 'yea': 1329, 'non': 768, 'collaboration': 223, 'peacemaker': 829, 'dcu': 279, 'nerdist': 749, 'fun': 457, 'trip': 1205, 'into': 581, 'nightmare': 763, 'emma': 351, 'mcguinness': 693, 'recently': 920, 'filed': 419, 'resorts': 949, 'severe': 1020, 'claimed': 215, 'su': 1098, 'yet': 1334, 'support': 1116, 'refers': 923, 'backlash': 96, 'various': 1240, 'actions': 10, 'interviews': 580, 'while': 1296, 'promoting': 887, 'upcoming': 1231, 'remake': 931, 'revolves': 957, 'doing': 321, 'right': 961, 'thing': 1158, 'feige': 410, 'served': 1014, 'president': 871, 'subsidiary': 1101, 'company': 234, 'role': 968, 'guided': 500, 'superhero': 1114, 'powerhouse': 861, 'flag': 432, 'max': 692, 'am': 49, 'apparently': 69, 'else': 348, 'taika': 1128, 'watiti': 1279, 'battlebots': 103, 'chions': 211, 'discovery': 313, 'ninth': 766, 'thanks': 1149, 'domestic': 322, 'licensing': 642, 'agreement': 32, 'warner': 1270, 'bros': 161, 'kick': 606, 'later': 621, 'chances': 198, 'weekend': 1284, 'being': 118, 'overtaken': 808, 'marathon': 683, 'grew': 494, 'continues': 249, 'annual': 61, 'international': 578, 'festivals': 414, 'wraps': 1324, 'food': 439, 'wine': 1305, 'nov': 771, 'holidays': 538, 'runs': 977, 'dec': 287, 'polynesian': 854, 'village': 1253, 'devolved': 302, 'alleged': 43, 'forced': 442, 'room': 970, 'cast': 188, 'members': 701, 'located': 654, 'short': 1031, 'monorail': 726, 'ride': 959, 'ma': 669, 'wonderfully': 1312, 'documentary': 318, 'certain': 197, 'generation': 465, 'holds': 536, 'dear': 283, 'owes': 810, 'debt': 285, 'even': 371, 'never': 754, 'visited': 1258, 'existed': 383, 'sucks': 1103, 'along': 46, 'service': 1015, 'eleven': 347, 'fx': 461, 'freeform': 449, 'starting': 1076, 'reports': 943, 'deadline': 281, 'disne': 314, 'sept': 1010, 'racked': 900, 'billion': 131, 'viewed': 1249, 'across': 8, 'total': 1193, 'turn': 1208, 'dipped': 311, 'staying': 1078, 'expect': 384, 'magical': 674, 'lodging': 655, 'transportation': 1201, 'came': 177, 'unwelcome': 1229, 'souvenir': 1060, 'akihabara': 36, 'tokyo': 1184, 'kotaro': 612, 'takamura': 1129, 'sasebo': 983, 'nagasaki': 744, 'prefecture': 863, 'requested': 944, 'central': 196, 'government': 487, 'quick': 896, 'whether': 1294, 'grant': 490, 'license': 641, 'major': 675, 'walk': 1263, 'scheduled': 991, 'actually': 16, 'alert': 39, 'soft': 1048, 'allears': 42, 'net': 750, 'words': 1315, 'actresses': 15, 'perception': 832, 'arrangement': 75, 'renegotiating': 935, 'weird': 1287, 'morbid': 730, 'autumn': 88, 'variations': 1239, 'singer': 1038, 'exclusive': 379, 'reneg': 933, 'fine': 429, 'jennifer': 593, 'lee': 632, 'told': 1185, 'crowd': 267, 'blown': 137, 'away': 93, 'seen': 1006, 'wide': 1301, 'sandy': 982, 'beaches': 107, 'gentle': 467, 'surf': 1118, 'endless': 356, 'outdoor': 804, 'activities': 12, 'island': 587, 'carolina': 184, 'beach': 106, 'destinations': 300, 'fewer': 417, 'miles': 715, 'packs': 816, 'surprising': 1123, 'amount': 55, 'meet': 699, 'totally': 1194, 'killer': 608, 'fair': 394}
Encoded Document is:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
In [ ]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
In [ ]:
# read data
x = pd.read_csv("../data/modified-data/Monthly_Stock.csv")
y = pd.read_csv("../data/modified-data/sp500_month.csv")
In [ ]:
## select columns
x = x[['NFLX', 'AAPL', 'JPM']]
y = y[['x']]
In [ ]:
# calculate correlation and plot
df_xy = pd.concat([x, y], axis=1)
sns.heatmap(df_xy.corr('pearson'), annot=True)
sns.pairplot(pd.DataFrame(np.hstack((x,y))))
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7fda206f8fd0>

Import¶

In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import accuracy_score
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
In [ ]:
df = pd.read_csv("../data/modified-data/cleaned_stock.csv")
df = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg', 'up', 'Stock']]
df['Stock'] = df['Stock'].astype('category').cat.codes
print(df)
# Disney = 0, Paramount = 1, Warner Bros = 2
            Open        High         Low       Close    Volume    Adjusted  \
0     155.830002  157.559998  155.360001  156.759995  10222800  156.759995   
1     158.589996  160.320007  155.550003  155.729996  16582000  155.729996   
2     156.520004  159.380005  155.100006  155.190002  12272100  155.190002   
3     156.240005  157.770004  153.679993  156.899994  11095300  156.899994   
4     156.899994  159.300003  156.289993  157.830002   9554600  157.830002   
...          ...         ...         ...         ...       ...         ...   
1375   10.010000   10.180000    9.720000    9.730000  32193700    9.730000   
1376    9.700000    9.970000    9.570000    9.800000  18682700    9.800000   
1377    9.800000    9.910000    9.530000    9.550000  15150700    9.550000   
1378    9.650000    9.970000    9.630000    9.850000  18128400    9.850000   
1379    9.890000   10.110000    9.800000    9.940000  23004200    9.940000   

              dn        mavg          up  Stock  
0     146.548151  151.926833  157.305514      0  
1     146.628339  152.323833  158.019326      0  
2     146.645134  152.596833  158.548532      0  
3     146.630934  152.776666  158.922399      0  
4     146.492148  153.018166  159.544185      0  
...          ...         ...         ...    ...  
1375    9.913337   10.476000   11.038663      2  
1376    9.808119   10.424167   11.040215      2  
1377    9.703007   10.362333   11.021660      2  
1378    9.632098   10.324167   11.016236      2  
1379    9.591074   10.298500   11.005926      2  

[1380 rows x 10 columns]
In [ ]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset here
# Assuming df is your DataFrame

# Select the features and target
X = df[["Adjusted", "dn"]]
y = df["Stock"]  # Replace "Stock" with your actual target column name

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("x_train.shape		:",X_train.shape)
print("y_train.shape		:",y_train.shape)

print("X_test.shape		:",X_test.shape)
print("y_test.shape		:",y_test.shape)
x_train.shape		: (1104, 2)
y_train.shape		: (1104,)
X_test.shape		: (276, 2)
y_test.shape		: (276,)
In [ ]:
# Initialize the MultinomialNB model
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display the classification report for more detailed metrics
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)
Accuracy: 0.5362318840579711
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.75      0.65        93
           1       0.00      0.00      0.00        95
           2       0.51      0.89      0.65        88

    accuracy                           0.54       276
   macro avg       0.36      0.55      0.43       276
weighted avg       0.35      0.54      0.42       276

/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
def report(y,ypred):
      #ACCURACY COMPUTE 
      print("Accuracy:",accuracy_score(y, ypred)*100)
      print("Number of mislabeled points out of a total %d points = %d"
            % (y.shape[0], (y != ypred).sum()))

def print_model_summary():
      # LABEL PREDICTIONS FOR TRAINING AND TEST SET 
      yp_train = model.predict(X_train)
      yp_test = model.predict(X_test)

      print("ACCURACY CALCULATION\n")

      print("TRAINING SET:")
      report(y_train,yp_train)

      print("\nTEST SET (UNTRAINED DATA):")
      report(y_test,yp_test)

      print("\nCHECK FIRST 20 PREDICTIONS")
      print("TRAINING SET:")
      print(y_train[0:20])
      print(yp_train[0:20])
      print("ERRORS:",yp_train[0:20]-y_train[0:20])

      print("\nTEST SET (UNTRAINED DATA):")
      print(y_test[0:20])
      print(yp_test[0:20])
      print("ERRORS:",yp_test[0:20]-y_test[0:20])
In [ ]:
print_model_summary()
ACCURACY CALCULATION

TRAINING SET:
Accuracy: 48.731884057971016
Number of mislabeled points out of a total 1104 points = 566

TEST SET (UNTRAINED DATA):
Accuracy: 53.62318840579711
Number of mislabeled points out of a total 276 points = 128

CHECK FIRST 20 PREDICTIONS
TRAINING SET:
695     1
1088    2
1106    2
558     1
494     1
462     1
1074    2
243     0
936     2
654     1
756     1
1309    2
420     0
982     2
54      0
1089    2
783     1
109     0
774     1
571     1
Name: Stock, dtype: int8
[0 2 2 2 0 2 2 0 2 0 0 2 0 0 2 0 2 0 2 0]
ERRORS: 695    -1
1088    0
1106    0
558     1
494    -1
462     1
1074    0
243     0
936     0
654    -1
756    -1
1309    0
420     0
982    -2
54      2
1089   -2
783     1
109     0
774     1
571    -1
Name: Stock, dtype: int8

TEST SET (UNTRAINED DATA):
377     0
548     1
979     2
1149    2
481     1
76      0
67      0
1096    2
584     1
824     1
1052    2
828     1
184     0
989     2
575     1
429     0
361     0
829     1
1239    2
196     0
Name: Stock, dtype: int8
[0 0 2 2 0 0 0 2 2 2 2 2 0 2 0 0 0 2 0 0]
ERRORS: 377     0
548    -1
979     0
1149    0
481    -1
76      0
67      0
1096    0
584     1
824     1
1052    0
828     1
184     0
989     0
575    -1
429     0
361     0
829     1
1239   -2
196     0
Name: Stock, dtype: int8
In [ ]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a heatmap of the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Class 0', 'Class 1', 'Class 2'], yticklabels=['Class 0', 'Class 1', 'Class 2'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
In [ ]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Convert labels to a binary form (one-hot encoding)
y_test_bin = label_binarize(y_test, classes=[0, 1, 2]) 

# Initialize the model and train it
model = MultinomialNB()
model.fit(X_train, y_train)

# Get class probabilities
y_prob = model.predict_proba(X_test)

# Compute ROC curve and AUC for each class
n_classes = y_test_bin.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves for each class
plt.figure(figsize=(8, 6))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve (area = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multiclass')
plt.legend(loc="lower right")
plt.show()
In [ ]:
# Define the number of iterations and an array to store accuracy values
n = 100
accuracy_values = []

# Load your data and labels here, replace with your actual data


for iteration in range(1, n + 1):
    # Split the data into training and validation subsets for each iteration
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=iteration)

    # Initialize the MultinomialNB model
    model = MultinomialNB()

    # Train the MultinomialNB model on the current subset of training data
    model.fit(x_train, y_train)

    # Make predictions on the validation data
    y_pred = model.predict(x_valid)

    # Calculate and store the accuracy for this time
    accuracy = accuracy_score(y_valid, y_pred)
    accuracy_values.append(accuracy)

# Plot the accuracy values
plt.figure(figsize=(10, 6))
plt.plot(range(1, n + 1), accuracy_values, marker='o')
plt.title('Accuracy')
plt.xlabel('Times')
plt.ylabel('Accuracy')
plt.grid()
plt.show()
In [ ]:
n = 1000

# Store accuracy values in a list
accuracy_values = []

for _ in range(n):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=np.random.randint(100))

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate and store the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_values.append(accuracy)

# Plot the distribution of accuracy values
plt.figure(figsize=(10, 6))
plt.hist(accuracy_values, bins=20, edgecolor='black')
plt.title('Distribution of Accuracy')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

Import¶

In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import accuracy_score
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED

Read and re-format¶

In [ ]:
#RELOAD FILE AND PRETEND THAT IS OUR STARTING POINT 
df=pd.read_csv('../data/raw-data/text.csv')  
print(df.shape)

#CONVERT FROM STRING LABELS TO INTEGERS 
labels=[]; #y1=[]; y2=[]
y1=[]
for label in df["Label"]:
    if label not in labels:
        labels.append(label)
        print("index =",len(labels)-1,": label =",label)
    for i in range(0,len(labels)):
        if(label==labels[i]):
            y1.append(i)
y1=np.array(y1)

# CONVERT DF TO LIST OF STRINGS 
corpus=df["Title"].to_list()
y2=df["Label"].to_numpy()

print("number of text chunks = ",len(corpus))
print(corpus[0:3])
(4998, 2)
index = 0 : label = Netflix
index = 1 : label = Hulu
index = 2 : label = Prime Video
index = 3 : label = Disney+
number of text chunks =  4998
['Breaking Bad', 'Stranger Things', 'Better Call Saul']

Vectorize the text data¶

In [ ]:
# INITIALIZE COUNT VECTORIZER
# minDF = 0.01 means "ignore terms that appear in less than 1% of the documents". 
# minDF = 5 means "ignore terms that appear in less than 5 documents".
vectorizer=CountVectorizer(min_df=0.001)   

# RUN COUNT VECTORIZER ON OUR COURPUS 
Xs  =  vectorizer.fit_transform(corpus)   
X=np.array(Xs.todense())

#CONVERT TO ONE-HOT VECTORS
maxs=np.max(X,axis=0)
X=np.ceil(X/maxs)

# DOUBLE CHECK 
print(X.shape,y1.shape,y2.shape)
print("DATA POINT-0:",X[0,0:10],"y1 =",y1[0],"  y2 =",y2[0])
(4998, 477) (4998,) (4998,)
DATA POINT-0: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] y1 = 0   y2 = Netflix

Partition Data¶

Assignment 3.2.4: Break data into an 80-20 training/test set

As a sanity check, reprint the shapes to make sure everything is correct

x_train.shape		: (120, 4)
y_train.shape		: (120,)
X_test.shape		: (30, 4)
y_test.shape		: (30,)
In [ ]:
# BEFORE SPLIT
print(y1[1000:1200])
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
In [ ]:
# INSERT CODE TO PARTITION DATASET INTO TRAINING-TEST

from sklearn.model_selection import train_test_split
test_ratio=0.2

# SPLIT ARRAYS OR MATRICES INTO RANDOM TRAIN AND TEST SUBSETS.
x_train, x_test, y_train, y_test = train_test_split(X, y1, test_size=test_ratio, random_state=0)
y_train=y_train.flatten()
y_test=y_test.flatten()

print("x_train.shape		:",x_train.shape)
print("y_train.shape		:",y_train.shape)

print("X_test.shape		:",x_test.shape)
print("y_test.shape		:",y_test.shape)
x_train.shape		: (3998, 477)
y_train.shape		: (3998,)
X_test.shape		: (1000, 477)
y_test.shape		: (1000,)
In [ ]:
#CHECK TO MAKE SURE IT WAS RANDOMIZED 
print(y_train[0:100])
[3 0 2 1 0 2 3 2 2 2 2 2 2 1 0 0 0 1 2 2 2 0 1 1 2 1 1 2 2 3 2 2 1 0 3 3 1
 2 2 0 0 1 1 1 0 1 2 2 0 1 0 2 2 2 2 3 0 1 3 2 2 0 0 0 2 1 0 2 1 0 1 0 0 1
 2 1 0 2 1 2 0 1 0 0 1 2 2 2 2 1 0 2 1 2 0 1 0 3 2 2]

Utility function¶

  • Write a function to report accuracy
  • Note this will act on object stored in pythons global scope. Therefore as long as everything is named the same you can recycle it for multiple models
In [ ]:
def report(y,ypred):
      #ACCURACY COMPUTE 
      print("Accuracy:",accuracy_score(y, ypred)*100)
      print("Number of mislabeled points out of a total %d points = %d"
            % (y.shape[0], (y != ypred).sum()))

def print_model_summary():
      # LABEL PREDICTIONS FOR TRAINING AND TEST SET 
      yp_train = model.predict(x_train)
      yp_test = model.predict(x_test)

      print("ACCURACY CALCULATION\n")

      print("TRAINING SET:")
      report(y_train,yp_train)

      print("\nTEST SET (UNTRAINED DATA):")
      report(y_test,yp_test)

      print("\nCHECK FIRST 20 PREDICTIONS")
      print("TRAINING SET:")
      print(y_train[0:20])
      print(yp_train[0:20])
      print("ERRORS:",yp_train[0:20]-y_train[0:20])

      print("\nTEST SET (UNTRAINED DATA):")
      print(y_test[0:20])
      print(yp_test[0:20])
      print("ERRORS:",yp_test[0:20]-y_test[0:20])

Classification model-1: Multinomial Naive Bayes¶

The following code applies the multi-nomial Naive Bayes classifier to the text data-set generated with wikipeda

Train model¶

  • Use SkLearn to train a MultinomialNB model
  • When the model is trained, insert code to output the following information about the training and test set (your numbers will vary)
    • Remember that the test set was NOT seen during the training process, and therefore "test" predictions show how the model does on new "unseen" data
In [ ]:
from sklearn.naive_bayes import MultinomialNB

# INITIALIZE MODEL 
model = MultinomialNB()

# TRAIN MODEL 
model.fit(x_train,y_train)

# PRINT REPORT USING UTILITY FUNCTION ABOVE
print_model_summary()
ACCURACY CALCULATION

TRAINING SET:
Accuracy: 48.049024512256125
Number of mislabeled points out of a total 3998 points = 2077

TEST SET (UNTRAINED DATA):
Accuracy: 41.099999999999994
Number of mislabeled points out of a total 1000 points = 589

CHECK FIRST 20 PREDICTIONS
TRAINING SET:
[3 0 2 1 0 2 3 2 2 2 2 2 2 1 0 0 0 1 2 2]
[3 0 1 0 2 0 0 2 0 2 2 2 0 0 0 0 0 1 2 0]
ERRORS: [ 0  0 -1 -1  2 -2 -3  0 -2  0  0  0 -2 -1  0  0  0  0  0 -2]

TEST SET (UNTRAINED DATA):
[2 1 0 0 1 0 0 2 2 0 3 0 2 2 0 1 3 1 0 0]
[2 1 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0]
ERRORS: [ 0  0  0  0  0  0  0 -2 -2  0 -3  0  0 -2  0 -1 -3 -1  0  0]
In [ ]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Generate a classification report
report = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
print("Classification Report:\n", report)

# Generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a heatmap of the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
Classification Report:
               precision    recall  f1-score   support

     Class 0       0.45      0.79      0.57       366
     Class 1       0.50      0.26      0.34       259
     Class 2       0.60      0.38      0.46       322
     Class 3       0.88      0.28      0.43        53

    accuracy                           0.49      1000
   macro avg       0.61      0.43      0.45      1000
weighted avg       0.53      0.49      0.47      1000

In [ ]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Convert labels to a binary form (one-hot encoding)
y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])

# Initialize the model and train it
model = MultinomialNB()
model.fit(x_train, y_train)

# Get class probabilities
y_prob = model.predict_proba(x_test)

# Compute ROC curve and AUC for each class
n_classes = y_test_bin.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves for each class
plt.figure(figsize=(8, 6))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve (area = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multiclass')
plt.legend(loc="lower right")
plt.show()
In [ ]:
# Define the number of times and an array to store accuracy values
n = 100
accuracy_values = []

# Load your data and labels here, replace with your actual data


for iteration in range(1, n + 1):
    # Split the data into training and validation subsets for each iteration
    x_train, x_valid, y_train, y_valid = train_test_split(X, y1, test_size=0.2, random_state=iteration)

    # Initialize the MultinomialNB model
    model = MultinomialNB()

    # Train the MultinomialNB model on the current subset of training data
    model.fit(x_train, y_train)

    # Make predictions on the validation data
    y_pred = model.predict(x_valid)

    # Calculate and store the accuracy for this times
    accuracy = accuracy_score(y_valid, y_pred)
    accuracy_values.append(accuracy)

# Plot the accuracy values 
plt.figure(figsize=(10, 6))
plt.plot(range(1, n + 1), accuracy_values, marker='o')
plt.title('Accuracy')
plt.xlabel('Times')
plt.ylabel('Accuracy')
plt.grid()
plt.show()
In [ ]:
n = 1000

# Store accuracy values in a list
accuracy_values = []

for _ in range(n):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=np.random.randint(100))

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate and store the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_values.append(accuracy)

# Plot the distribution of accuracy values
plt.figure(figsize=(10, 6))
plt.hist(accuracy_values, bins=20, edgecolor='black')
plt.title('Distribution of Accuracy')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

format: html: embed-resources: true code-fold: true


In [ ]:
# import the necessary packages
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
In [ ]:
# read data
df = pd.read_csv("../data/modified-data/stock_cluster.csv")
df.head()
Out[ ]:
Unnamed: 0 Date Open High Low Close Volume Adjusted dn mavg up Stock direction
0 2022-01-03 2022-01-03 605.609985 609.989990 590.559998 597.369995 3067500 597.369995 587.291452 607.250833 627.210215 NFLX Decreasing
1 2022-01-04 2022-01-04 599.909973 600.409973 581.599976 591.150024 4393100 591.150024 585.186408 606.287166 627.387925 NFLX Decreasing
2 2022-01-05 2022-01-05 592.000000 592.840027 566.880005 567.520020 4148700 567.520020 580.284351 603.976666 627.668982 NFLX Decreasing
3 2022-01-06 2022-01-06 554.340027 563.359985 542.010010 553.289978 5711800 553.289978 570.018038 600.225332 630.432625 NFLX Decreasing
4 2022-01-07 2022-01-07 549.460022 553.429993 538.219971 541.059998 3382900 541.059998 558.782412 596.575831 634.369250 NFLX Decreasing
In [ ]:
# define X and Y
from sklearn.preprocessing import StandardScaler

X = df[['Adjusted', 'Volume']]
print(X.head())
X = StandardScaler().fit_transform(X)

Y = df[['Stock']]
print(Y.head())
     Adjusted   Volume
0  597.369995  3067500
1  591.150024  4393100
2  567.520020  4148700
3  553.289978  5711800
4  541.059998  3382900
  Stock
0  NFLX
1  NFLX
2  NFLX
3  NFLX
4  NFLX
In [ ]:
# visualization
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Adjusted', y='Volume', hue='Stock')
plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('Cluster Plot')
plt.legend(loc = "upper right", title = "Stock")
plt.show()
In [ ]:
# import relevent libraries for clustering. we will use KMeans, AgglomerativeClustering, MeanShift, Birch, and DBSCAN
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
In [ ]:
import sklearn.cluster

# THIS WILL ITERATE OVER ONE HYPER-PARAMETER (GRID SEARCH) 
# AND RETURN THE CLUSTER RESULT THAT OPTIMIZES THE SILHOUETTE SCORE
def maximize_silhouette(X,algo="birch",nmax=20,i_plot=False):

    # PARAM
    i_print=False

    #FORCE CONTIGUOUS
    X=np.ascontiguousarray(X) 

    # LOOP OVER HYPER-PARAM
    params=[]; sil_scores=[]
    sil_max=-10
    for param in range(2,nmax+1):
        if(algo=="birch"):
            model = sklearn.cluster.Birch(n_clusters=param).fit(X)
            labels=model.predict(X)

        if(algo=="ag"):
            model = sklearn.cluster.AgglomerativeClustering(n_clusters=param).fit(X)
            labels=model.labels_

        if(algo=="dbscan"):
            param=0.05*(param-1)
            model = sklearn.cluster.DBSCAN(eps=param).fit(X)
            labels=model.labels_

        if(algo=="kmeans"):
            model = sklearn.cluster.KMeans(n_clusters=param).fit(X)
            labels=model.predict(X)
        
        if(algo=="meanshift"):
            model = sklearn.cluster.MeanShift(bandwidth=param).fit(X)
            labels=model.labels_

        try:
            sil_scores.append(sklearn.metrics.silhouette_score(X,labels))
            params.append(param)
        except:
            continue 

        if(i_print): print(param,sil_scores[-1])
        
        if(sil_scores[-1]>sil_max):
             opt_param=param
             sil_max=sil_scores[-1]
             opt_labels=labels
    print("Algorithm = ", algo)
    print("OPTIMAL PARAMETER =",opt_param)

    if(i_plot):
        fig, ax = plt.subplots()
        ax.plot(params, sil_scores, "-o")  
        ax.set(xlabel='Hyper-parameter', ylabel='Silhouette')
        plt.show()

    return opt_labels
In [ ]:
# AGGLOMERATIVE CLUSTERING
opt_labels=maximize_silhouette(X,algo="ag",nmax=15, i_plot=True)
# DBSCAN
opt_labels=maximize_silhouette(X,algo="dbscan",nmax=15, i_plot=True)
# BRICH
opt_labels=maximize_silhouette(X,algo="birch",nmax=15, i_plot=True)
# KMEANS
opt_labels=maximize_silhouette(X,algo="kmeans",nmax=15, i_plot=True)
Algorithm =  ag
OPTIMAL PARAMETER = 3
Algorithm =  dbscan
OPTIMAL PARAMETER = 0.35000000000000003
Algorithm =  birch
OPTIMAL PARAMETER = 4
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (12). Decrease the threshold.
  warnings.warn(
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (13). Decrease the threshold.
  warnings.warn(
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (14). Decrease the threshold.
  warnings.warn(
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_birch.py:725: ConvergenceWarning: Number of subclusters found (11) by BIRCH is less than (15). Decrease the threshold.
  warnings.warn(
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
Algorithm =  kmeans
OPTIMAL PARAMETER = 3
In [ ]:
## kmeans
from scipy.spatial.distance import cdist
distortions = []
inertias = []

for i in range(1, 11):
    # Initialize and fit the KMeans model
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(X)
    
    # Calculate distortion and inertia
    inertias.append(kmeans.inertia_)
    distortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'),axis=1)) / X.shape[0])

# Create a DataFrame to store the data
data = pd.DataFrame({'Cluster': range(1, 11), 'Distortion': distortions, 'Inertia': inertias})

data
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
Out[ ]:
Cluster Distortion Inertia
0 1 1.268869 2862.000000
1 2 0.855398 1407.136470
2 3 0.463472 493.295513
3 4 0.391171 332.985543
4 5 0.330239 237.692843
5 6 0.292370 183.698590
6 7 0.255055 138.253996
7 8 0.238272 119.543276
8 9 0.227034 101.652802
9 10 0.212766 89.814633
In [ ]:
# plot distortion and inertia for kmeans, you can either plot them seperately or use fig, ax = plt.subplots(1, 2) to plot them in the same figure. Suggest the optimal number of clusters based on the plot.

# Create two subplots, sharing the x-axis
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)

# Plot Distortion in the top subplot
ax1.plot(data['Cluster'], data['Distortion'], linestyle='-', label='Distortion', color='green')

# Plot Inertia in the bottom subplot
ax2.plot(data['Cluster'], data['Inertia'], linestyle='-', label='Inertia', color='orange')
ax2.set_xlabel('Cluster')

# Add a legend to both subplots
ax1.legend()
ax2.legend()
plt.show()
In [ ]:
# Perform kmeans
kmeans = KMeans(n_clusters = 3)
cluster_labels = kmeans.fit_predict(X)

# Add the cluster labels to your DataFrame
df['cluster_labels'] = cluster_labels

# Visualize the clusters
plt.figure(figsize=(10, 6))

# Plot the data points with different colors for each cluster
for cluster_label in df['cluster_labels'].unique():
    cluster_data = df[df['cluster_labels'] == cluster_label]
    plt.scatter(
        cluster_data['Adjusted'],
        cluster_data['Volume'],
        label=f'Cluster {cluster_label}',
    )

plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('Kmeans Clustering')
plt.legend()
plt.show()
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
In [ ]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Calculate silhouette scores
silhouette_vals = silhouette_samples(X, cluster_labels)
silhouette_avg = silhouette_score(X, cluster_labels)

# Create Silhouette plot
plt.figure(figsize=(10, 6))
y_lower = 10

for i in range(3):
    ith_cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    ith_cluster_silhouette_vals.sort()

    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    color = plt.rcParams['axes.prop_cycle'].by_key()['color'][i]
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for the next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

plt.title('Silhouette Plot for K-Means Clustering')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')

# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--", linewidth=2)

plt.yticks([])  # Clear the yaxis labels / ticks
plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
In [ ]:
# perform DBSCAN clustering. use the eps and min_samples parameters to find the optimal number of clusters. plot the number of clusters vs the silhouette score. Suggest the optimal number of clusters based on the plot.
from sklearn.metrics import silhouette_score

# Initialize an empty list to store silhouette scores
silhouette_scores = []

# Define a range of values for eps and min_samples
eps_range = np.linspace(0.05, 1.0, num=20)
min_samples_range = range(1, 11)

# Perform DBSCAN clustering for different combinations of eps and min_samples
for eps in eps_range:
    for min_samples in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)  # Replace X with your data
        num_clusters = len(np.unique(labels)) - 1  # Subtract 1 to account for noise points
        if num_clusters > 1:  # Ensure that at least two clusters are formed
            silhouette = silhouette_score(X, labels)
            silhouette_scores.append((eps, min_samples, num_clusters, silhouette))

# Convert the list of scores to a NumPy array for easier manipulation
silhouette_scores = np.array(silhouette_scores)

# Plot the number of clusters vs. silhouette score
plt.figure(figsize=(10, 6))
plt.scatter(silhouette_scores[:, 2], silhouette_scores[:, 3], c=silhouette_scores[:, 0], cmap='viridis', s=50)
plt.colorbar(label='Epsilon (eps)')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Number of Clusters vs. Silhouette Score for DBSCAN')
#plt.grid(True)

# Find the combination of eps and min_samples with the highest silhouette score
best_score_idx = np.argmax(silhouette_scores[:, 3])
best_eps = silhouette_scores[best_score_idx, 0]
best_min_samples = silhouette_scores[best_score_idx, 1]
best_num_clusters = silhouette_scores[best_score_idx, 2]
best_silhouette_score = silhouette_scores[best_score_idx, 3]

print(f"Best combination: eps = {best_eps}, min_samples = {best_min_samples}")
print(f"Number of clusters: {best_num_clusters}")
print(f"Best Silhouette Score: {best_silhouette_score}")

plt.show()
Best combination: eps = 0.1, min_samples = 9.0
Number of clusters: 4.0
Best Silhouette Score: 0.4658880301489884
In [ ]:
# Perform DBSCAN
dbscan = DBSCAN(eps=0.1, min_samples=9)
cluster_labels = dbscan.fit_predict(X)

# Add the cluster labels to your DataFrame
df['cluster_labels'] = cluster_labels

# Visualize the clusters
plt.figure(figsize=(10, 6))

# Plot the data points with different colors for each cluster
for cluster_label in df['cluster_labels'].unique():
    cluster_data = df[df['cluster_labels'] == cluster_label]
    plt.scatter(
        cluster_data['Adjusted'],
        cluster_data['Volume'],
        label=f'Cluster {cluster_label}',
    )
plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('DBSCAN Clustering')
plt.legend()
plt.show()
In [ ]:
# Calculate silhouette scores
silhouette_vals = silhouette_samples(X, cluster_labels)
silhouette_avg = silhouette_score(X, cluster_labels)

# Create Silhouette plot
plt.figure(figsize=(10, 6))
y_lower = 10

for i in range(-1, 4):
    ith_cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    ith_cluster_silhouette_vals.sort()

    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    color = plt.rcParams['axes.prop_cycle'].by_key()['color'][i+1]
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for the next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

plt.title('Silhouette Plot for K-Means Clustering')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')

# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--", linewidth=2)

plt.yticks([])  # Clear the yaxis labels / ticks
plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
In [ ]:
# Perform Hierarchical clustering
agg_clustering = AgglomerativeClustering(n_clusters=3)  # Specify the number of clusters
cluster_labels = agg_clustering.fit_predict(X)

# Add the cluster labels to your DataFrame
df['cluster_labels'] = cluster_labels

# Visualize the clusters
plt.figure(figsize=(10, 6))

# Plot the data points with different colors for each cluster
for cluster_label in df['cluster_labels'].unique():
    cluster_data = df[df['cluster_labels'] == cluster_label]
    plt.scatter(
        cluster_data['Adjusted'],
        cluster_data['Volume'],
        label=f'Cluster {cluster_label}',
    )

plt.xlabel('Adjusted')
plt.ylabel('Volume')
plt.title('Hierarchical Clustering')
plt.legend()
plt.show()
In [ ]:
# create linkage for agglomerative clustering, and the dendrogram for the linkage. Suggest the optimal number of clusters based on the dendrogram.
from scipy.cluster.hierarchy import dendrogram, linkage

# Calculate the linkage matrix
linkage_matrix = linkage(X, method='ward')  # You can choose a different linkage method

# Create and visualize the dendrogram
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix, truncate_mode='level')
plt.axhline(y=20, color='r', linestyle='--')
plt.show()
In [ ]:
# Calculate silhouette scores
silhouette_vals = silhouette_samples(X, cluster_labels)
silhouette_avg = silhouette_score(X, cluster_labels)

# Create Silhouette plot
plt.figure(figsize=(10, 6))
y_lower = 10

for i in range(3):
    ith_cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    ith_cluster_silhouette_vals.sort()

    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    color = plt.rcParams['axes.prop_cycle'].by_key()['color'][i]
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, facecolor=color, edgecolor=color, alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for the next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

plt.title('Silhouette Plot for Hierarchical Clustering')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')

# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--", linewidth=2)

plt.yticks([])  # Clear the yaxis labels / ticks
plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
In [ ]:
 
In [ ]:
 

format: html: embed-resources: true code-fold: true


In [ ]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
In [ ]:
# read data
df = pd.read_csv("../data/modified-data/stock_cluster.csv")
df.head()
Out[ ]:
Unnamed: 0 Date Open High Low Close Volume Adjusted dn mavg up Stock direction
0 2022-01-03 2022-01-03 605.609985 609.989990 590.559998 597.369995 3067500 597.369995 587.291452 607.250833 627.210215 NFLX Decreasing
1 2022-01-04 2022-01-04 599.909973 600.409973 581.599976 591.150024 4393100 591.150024 585.186408 606.287166 627.387925 NFLX Decreasing
2 2022-01-05 2022-01-05 592.000000 592.840027 566.880005 567.520020 4148700 567.520020 580.284351 603.976666 627.668982 NFLX Decreasing
3 2022-01-06 2022-01-06 554.340027 563.359985 542.010010 553.289978 5711800 553.289978 570.018038 600.225332 630.432625 NFLX Decreasing
4 2022-01-07 2022-01-07 549.460022 553.429993 538.219971 541.059998 3382900 541.059998 558.782412 596.575831 634.369250 NFLX Decreasing
In [ ]:
# define X
X = df[['Open', 'High','Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg', 'up']]
print(X.head())
         Open        High         Low       Close   Volume    Adjusted  \
0  605.609985  609.989990  590.559998  597.369995  3067500  597.369995   
1  599.909973  600.409973  581.599976  591.150024  4393100  591.150024   
2  592.000000  592.840027  566.880005  567.520020  4148700  567.520020   
3  554.340027  563.359985  542.010010  553.289978  5711800  553.289978   
4  549.460022  553.429993  538.219971  541.059998  3382900  541.059998   

           dn        mavg          up  
0  587.291452  607.250833  627.210215  
1  585.186408  606.287166  627.387925  
2  580.284351  603.976666  627.668982  
3  570.018038  600.225332  630.432625  
4  558.782412  596.575831  634.369250  
In [ ]:
# pair plot
sns.pairplot(X)
plt.show()
In [ ]:
plt.figure(figsize=(10, 6))
sns.heatmap(X.corr(), annot=True)
plt.title('Correlation Heatmap')
plt.show()
In [ ]:
pca = PCA()
pca.fit(X)

plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), 
         pca.explained_variance_ratio_.cumsum(), marker='o')
plt.title('Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()
In [ ]:
# PCA
X = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)   # 2 dimension
pca.fit(X)                 
newX=pca.fit_transform(X)  
print(pca.explained_variance_ratio_)  
print(newX)
[0.89109103 0.09554449]
[[10.85433149  0.68579939]
 [10.71167546  0.70664221]
 [10.41984188  0.66110329]
 ...
 [-1.50904694 -0.95411545]
 [-1.48990881 -1.00794026]
 [-1.47147694 -1.05500015]]
In [ ]:
# Create a scatter plot of the first two attributes of the original data
plt.figure(figsize=(10, 6))

# Perform PCA
pca = PCA(n_components=2)
newX = pca.fit_transform(X)

plt.scatter(newX[:, 0], newX[:, 1], alpha=0.8)
plt.title('PCA Visualization')

# Plotting the principal components
for i, (x, y) in enumerate(pca.components_.T):
    plt.arrow(0, 0, x, y, color='r', alpha=0.8)
    plt.text(x, y, f'Attribute_{i+1}', color='g', fontsize=8)

plt.show()
In [ ]:
# Create a scatter plot of the original data
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], alpha=0.8, label='Original Data')
plt.title('Original Data')

# Perform PCA
pca = PCA(n_components=2)
newX = pca.fit_transform(X)

# Create a scatter plot of the transformed data after PCA
plt.subplot(1, 2, 2)
plt.scatter(newX[:, 0], newX[:, 1], alpha=0.8, label='Transformed Data')
plt.title('Data After PCA')

# Plotting the principal components (if needed)
plt.quiver(0, 0, pca.components_[0, 0], pca.components_[0, 1], angles='xy', scale_units='xy', scale=0.5, color='r')
plt.quiver(0, 0, pca.components_[1, 0], pca.components_[1, 1], angles='xy', scale_units='xy', scale=0.5, color='b')

plt.legend()
plt.tight_layout()
plt.show()
In [ ]:
# Create a scatter plot of the first two attributes of the original data
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 4], alpha=0.8, label='Original Data (Attributes 1 and 2)')
plt.title('Original Data')

# Perform PCA
pca = PCA(n_components=2)
newX = pca.fit_transform(X)

# Create a scatter plot of the transformed data after PCA
plt.subplot(1, 2, 2)
plt.scatter(newX[:, 0], newX[:, 1], alpha=0.8, label='Transformed Data')
plt.title('Data After PCA')

# Plotting the principal components (if needed)
for i in range(pca.components_.shape[0]):
    plt.quiver(0, 0, pca.components_[i, 0], pca.components_[i, 1],
               angles='xy', scale_units='xy', scale=0.5, color='r')

plt.legend()
plt.tight_layout()
plt.show()
In [ ]:
kl_divergence = []

# Iterate over a range of components to fit t-SNE models
for n_components in range(1, min(X.shape[1], 4)):
    tsne = TSNE(n_components=n_components, random_state=42, method='barnes_hut')
    X_tsne = tsne.fit_transform(X)
    kl_divergence.append(tsne.kl_divergence_)

# Plot KL Divergence for each number of components
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(kl_divergence) + 1), kl_divergence, marker='o')
plt.title('KL Divergence by Number of Components in t-SNE (barnes_hut)')
plt.xlabel('Number of Components')
plt.ylabel('KL Divergence')
plt.show()
In [ ]:
# Create a t-SNE model with 2 components
tsne = TSNE(n_components=2)

# Fit and transform the data
X_tsne = tsne.fit_transform(X)

# Create a scatter plot of the t-SNE-transformed data
plt.figure(figsize=(10, 6))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.8)
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()
In [ ]:
from mpl_toolkits.mplot3d import Axes3D

tsne_3d = TSNE(n_components=3)
X_tsne_3d = tsne_3d.fit_transform(X)

fig = plt.figure(figsize=(20, 14))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2], alpha=0.8)
ax.set_title('t-SNE 3D Visualization')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
plt.show()
In [ ]:
# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Perform t-SNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Scatter plot for PCA
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.8)
axes[0].set_title('PCA Visualization')
axes[0].set_xlabel('PCA Component 1')
axes[0].set_ylabel('PCA Component 2')

# Scatter plot for t-SNE
axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.8)
axes[1].set_title('t-SNE Visualization')
axes[1].set_xlabel('t-SNE Component 1')
axes[1].set_ylabel('t-SNE Component 2')

plt.tight_layout()
plt.show()

format: html: embed-resources: true code-fold: true


Import¶

In [ ]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn import tree
from IPython.display import Image
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

4.1.1: Import¶

The following code will import the data file into a pandas data-frame

In [ ]:
# LOAD THE DATAFRAME
from sklearn.datasets import load_breast_cancer
(x,y) = load_breast_cancer(return_X_y=True,as_frame=True)
df=pd.concat([x,y],axis=1)

# LOOK AT FIRST ROW
print(df.iloc[0])
mean radius                  17.990000
mean texture                 10.380000
mean perimeter              122.800000
mean area                  1001.000000
mean smoothness               0.118400
mean compactness              0.277600
mean concavity                0.300100
mean concave points           0.147100
mean symmetry                 0.241900
mean fractal dimension        0.078710
radius error                  1.095000
texture error                 0.905300
perimeter error               8.589000
area error                  153.400000
smoothness error              0.006399
compactness error             0.049040
concavity error               0.053730
concave points error          0.015870
symmetry error                0.030030
fractal dimension error       0.006193
worst radius                 25.380000
worst texture                17.330000
worst perimeter             184.600000
worst area                 2019.000000
worst smoothness              0.162200
worst compactness             0.665600
worst concavity               0.711900
worst concave points          0.265400
worst symmetry                0.460100
worst fractal dimension       0.118900
target                        0.000000
Name: 0, dtype: float64
In [ ]:
df = pd.read_csv("../data/modified-data/stock_cluster.csv")
df = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg', 'up', 'Stock']]
In [ ]:
# # RUN THE FOLLOWING CODE TO GENERATE A SEABORN PAIRPLOT 
tmp=pd.concat([df.sample(n=10,axis=1),y],axis=1)
print(tmp.shape)
sns.pairplot(tmp,hue="target", diag_kind='kde')
plt.show()
(1431, 11)
In [ ]:
# Convert the stock to categorical
df['Stock'] = df['Stock'].astype('category')
# Encode the stock as numeric using cat.codes
df['Stock'] = df['Stock'].cat.codes
df.rename(columns={"Stock": 'target'}, inplace=True)
In [ ]:
# INSERT CODE TO PRINT ITS SHAPE AND COLUMN NAMES
print(df.shape)
print(df.columns)
(1431, 10)
Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted', 'dn', 'mavg',
       'up', 'target'],
      dtype='object')

4.1.2: Basic data exploration¶

We will be using y="target" (output target) and all other remaining columns as our X (input feature) matrix.

Before doing analysis it is always good to "get inside" the data and see what we are working with

In [ ]:
#INSERT CODE TO PRINT THE FOLLOWING DATA-FRAME WHICH SUMMARIZES EACH COLUMN 
summary_stats = df.describe().transpose()[['min', 'mean', 'max']]
dtypes = df.dtypes
summary = pd.concat([dtypes, summary_stats], axis=1)
summary.columns = ['dtypes', 'min', 'mean', 'max']
print(summary)
           dtypes           min          mean           max
Open      float64  1.018000e+02  2.090672e+02  6.056100e+02
High      float64  1.045400e+02  2.120809e+02  6.099900e+02
Low       float64  1.012800e+02  2.060704e+02  5.905600e+02
Close     float64  1.019600e+02  2.091259e+02  5.973700e+02
Volume      int64  1.404700e+06  3.198843e+07  1.826020e+08
Adjusted  float64  9.900871e+01  2.076091e+02  5.973700e+02
dn        float64  9.866728e+01  1.920277e+02  5.872915e+02
mavg      float64  1.086962e+02  2.100733e+02  6.072508e+02
up        float64  1.161748e+02  2.281188e+02  6.778163e+02
target       int8  0.000000e+00  1.000000e+00  2.000000e+00
In [ ]:
# INSERT CODE TO EXPLORE THE LOAD BALANCE AND COUNT THE NUMBER OF SAMPLES FOR EACH TARGET (THEN PRINT THE RESULT)
target_counts = df['target'].value_counts()

for target, count in target_counts.items():
    print(f"Number of points with target = {target}: {count} {count / len(df)}")
Number of points with target=2: 477 0.3333333333333333
Number of points with target=0: 477 0.3333333333333333
Number of points with target=1: 477 0.3333333333333333
In [ ]:
# RUN THE FOLLOWING CODE TO SHOW THE HEAT-MAP FOR THE CORRELATION MATRIX
corr = df.corr();  #print(corr)					#COMPUTE CORRELATION OF FEATER MATRIX
print(corr.shape)
sns.set_theme(style="white")
f, ax = plt.subplots(figsize=(20, 20))  # Set up the matplotlib figure
cmap = sns.diverging_palette(230, 20, as_cmap=True) 	# Generate a custom diverging colormap
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr,  annot=True, cmap=cmap, vmin=-1, vmax=1, center=0,
        square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show();
(10, 10)

When the dataset is very large then the seaborn pairplot is often very slow.

However, in this case it can still be useful to look at a subset of the features

4.1.3 Isolate inputs/output & Split data¶

In [ ]:
# INSERT CODE TO MAKE DATA-FRAMES (or numpy arrays) (X,Y) WHERE Y="target" COLUMN and X="everything else"
X = df.drop(columns=['target']) 
Y = df['target']

#X_array = X.values
#Y_array = Y.values
In [ ]:
# INSERT CODE TO PARTITION THE DATASET INTO TRAINING AND TEST SETS
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
In [ ]:
# INSERT CODE, AS A CONSISTENCY CHECK, TO PRINT THE TYPE AND SHAPE OF x_train, x_test, y_train, y_test
print(type(x_train), x_train.shape)
print(type(y_train), y_train.shape)
print(type(x_test), x_test.shape)
print(type(y_test), y_test.shape)
<class 'pandas.core.frame.DataFrame'> (1144, 9)
<class 'pandas.core.series.Series'> (1144,)
<class 'pandas.core.frame.DataFrame'> (287, 9)
<class 'pandas.core.series.Series'> (287,)

4.1.4 Training the model¶

In [ ]:
## INSERT CODE BELOW TO TRAIN A SKLEARN DECISION TREE MODEL ON x_train,y_train 
model = tree.DecisionTreeClassifier()
model = model.fit(x_train, y_train)

4.1.5 Check the results¶

Evaluate the performance of the decision tree model by using the test data.

In [ ]:
# INSERT CODE TO USE THE MODEL TO MAKE PREDICTIONS FOR THE TRAINING AND TEST SET 
yp_train = model.predict(x_train)
yp_test = model.predict(x_test)

Use the following reference to display the confusion matrix. SKlearn Confusion Matrix will give you the code you need.

In the function below, also print the following as part of the function output

ACCURACY: 0.9035087719298246
NEGATIVE RECALL (Y=0): 0.9574468085106383
NEGATIVE PRECISION (Y=0): 0.8333333333333334
POSITIVE RECALL (Y=1): 0.8656716417910447
POSITIVE PRECISION (Y=1): 0.9666666666666667
[[45  2]
[ 9 58]]
In [ ]:
#INSERT CODE TO WRITE A FUNCTION def confusion_plot(y_data,y_pred) WHICH GENERATES A CONFUSION MATRIX PLOT AND PRINTS THE INFORMATION ABOVE (see link above for example)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def confusion_plot(y,yp):
    cm = confusion_matrix(y, yp)
    accuracy = accuracy_score(y, yp)
    precision_0 = precision_score(y, yp, pos_label=0)
    precision_1 = precision_score(y, yp, pos_label=1)
    recall_0 = recall_score(y, yp, pos_label=0)
    recall_1 = recall_score(y, yp, pos_label=1)
    
    print("ACCURACY:", accuracy)
    print("NEGATIVE RECALL (Y=0):", recall_0)
    print("NEGATIVE PRECISION (Y=0):", precision_0)
    print("POSITIVE RECALL (Y=1):", recall_1)
    print("POSITIVE PRECISION (Y=1):", precision_1)
    print(cm)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()
In [ ]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt

def confusion_plot(y, yp):
    cm = confusion_matrix(y, yp)
    accuracy = accuracy_score(y, yp)
    precision_0 = precision_score(y, yp, average=None)[0]
    precision_1 = precision_score(y, yp, average=None)[1]
    precision_2 = precision_score(y, yp, average=None)[2]
    recall_0 = recall_score(y, yp, average=None)[0]
    recall_1 = recall_score(y, yp, average=None)[1]
    recall_2 = recall_score(y, yp, average=None)[2]

    # Set the figure size directly
    plt.figure(figsize=(8, 8))

    print("ACCURACY:", accuracy)
    print("CLASS 0 RECALL:", recall_0)
    print("CLASS 0 PRECISION:", precision_0)
    print("CLASS 1 RECALL:", recall_1)
    print("CLASS 1 PRECISION:", precision_1)
    print("CLASS 2 RECALL:", recall_2)
    print("CLASS 2 PRECISION:", precision_2)
    print(cm)

    # Create ConfusionMatrixDisplay
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()

# Example usage:
# confusion_plot(y_true, y_pred)
In [ ]:
# RUN THE FOLLOWING CODE TO TEST YOUR FUNCTION 
print("------TRAINING------")
confusion_plot(y_train,yp_train)
print("------TEST------")
confusion_plot(y_test,yp_test)
------TRAINING------
ACCURACY: 1.0
CLASS 0 RECALL: 1.0
CLASS 0 PRECISION: 1.0
CLASS 1 RECALL: 1.0
CLASS 1 PRECISION: 1.0
CLASS 2 RECALL: 1.0
CLASS 2 PRECISION: 1.0
[[383   0   0]
 [  0 377   0]
 [  0   0 384]]
<Figure size 576x576 with 0 Axes>
------TEST------
ACCURACY: 0.9930313588850174
CLASS 0 RECALL: 0.9893617021276596
CLASS 0 PRECISION: 0.9893617021276596
CLASS 1 RECALL: 0.99
CLASS 1 PRECISION: 1.0
CLASS 2 RECALL: 1.0
CLASS 2 PRECISION: 0.9893617021276596
[[93  0  1]
 [ 1 99  0]
 [ 0  0 93]]
<Figure size 576x576 with 0 Axes>

4.1.6 Visualize the tree¶

In [ ]:
# INSERT CODE TO WRITE A FUNCTION "def plot_tree(model,X,Y)" VISUALIZE THE DECISION TREE (see https://mljar.com/blog/visualize-decision-tree/ for an example)
def plot_tree(model, X, Y):
    plt.figure(figsize=(20, 10))
    tree.plot_tree(model, feature_names=X.columns, class_names=[str(class_label) for class_label in model.classes_], filled=True)
    plt.show()

plot_tree(model, X, Y)

4.1.6 Hyper-parameter turning¶

The "max_depth" hyper-parameter lets us control the number of layers in our tree.

Lets iterate over "max_depth" and try to find the set of hyper-parameters with the lowest training AND test error.

In [ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score

# COMPLETE THE FOLLOWING CODE TO LOOP OVER POSSIBLE HYPER-PARAMETERS VALUES
test_results = []
train_results = []

for num_layer in range(1, 20):
    model = tree.DecisionTreeClassifier(max_depth=num_layer)
    model = model.fit(x_train, y_train)

    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)

    test_results.append([num_layer, accuracy_score(y_test, yp_test), precision_score(y_test, yp_test, average=None),
                         recall_score(y_test, yp_test, average=None)])
    train_results.append([num_layer, accuracy_score(y_train, yp_train), precision_score(y_train, yp_train, average=None),
                          recall_score(y_train, yp_train, average=None)])

# Extract results for plotting
num_layers = [result[0] for result in test_results]

# Accuracy
test_accuracies = [result[1] for result in test_results]
train_accuracies = [result[1] for result in train_results]

# Recall
test_recalls_class_0 = [result[2][0] for result in test_results]
train_recalls_class_0 = [result[2][0] for result in train_results]

test_recalls_class_1 = [result[2][1] for result in test_results]
train_recalls_class_1 = [result[2][1] for result in train_results]

test_recalls_class_2 = [result[2][2] for result in test_results]
train_recalls_class_2 = [result[2][2] for result in train_results]

# Precision
test_precisions_class_0 = [result[3][0] for result in test_results]
train_precisions_class_0 = [result[3][0] for result in train_results]

test_precisions_class_1 = [result[3][1] for result in test_results]
train_precisions_class_1 = [result[3][1] for result in train_results]

test_precisions_class_2 = [result[3][2] for result in test_results]
train_precisions_class_2 = [result[3][2] for result in train_results]

# Accuracy
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_accuracies, 'o-', label='Test Accuracy')
plt.plot(num_layers, train_accuracies, 'o-', label='Train Accuracy')
plt.title('Accuracy vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Recall Class 0
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_recalls_class_0, 'o-', label='Test Recall Class 0')
plt.plot(num_layers, train_recalls_class_0, 'o-', label='Train Recall Class 0')
plt.title('Recall Class 0 vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Recall Class 0')
plt.legend()
plt.show()

# Recall Class 1
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_recalls_class_1, 'o-', label='Test Recall Class 1')
plt.plot(num_layers, train_recalls_class_1, 'o-', label='Train Recall Class 1')
plt.title('Recall Class 1 vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Recall Class 1')
plt.legend()
plt.show()

# Recall Class 2
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_recalls_class_2, 'o-', label='Test Recall Class 2')
plt.plot(num_layers, train_recalls_class_2, 'o-', label='Train Recall Class 2')
plt.title('Recall Class 2 vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Recall Class 2')
plt.legend()
plt.show()

# Precision Class 0
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_precisions_class_0, 'o-', label='Test Precision Class 0')
plt.plot(num_layers, train_precisions_class_0, 'o-', label='Train Precision Class 0')
plt.title('Precision Class 0 vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Precision Class 0')
plt.legend()
plt.show()

# Precision Class 1
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_precisions_class_1, 'o-', label='Test Precision Class 1')
plt.plot(num_layers, train_precisions_class_1, 'o-', label='Train Precision Class 1')
plt.title('Precision Class 1 vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Precision Class 1')
plt.legend()
plt.show()

# Precision Class 2
plt.figure(figsize=(10,6))
plt.plot(num_layers, test_precisions_class_2, 'o-', label='Test Precision Class 2')
plt.plot(num_layers, train_precisions_class_2, 'o-', label='Train Precision Class 2')
plt.title('Precision Class 2 vs. Number of Layers')
plt.xlabel('Number of Layers')
plt.ylabel('Precision Class 2')
plt.legend()
plt.show()
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

4.1.7 Train optimal model¶

Re-train the decision tree using the optimal hyper-parameter obtained from the plot above

In [ ]:
#### COMPLETE THE CODE BELOW TO TRAIN A SKLEARN DECISION TREE MODEL ON x_train,y_train 
from sklearn import tree
model = tree.DecisionTreeClassifier(max_depth=4)
model = model.fit(x_train, y_train)

yp_train=model.predict(x_train)
yp_test=model.predict(x_test)
In [ ]:
# RUN THE FOLLOWING CODE TO EVALUATE YOUR MODEL
print("------TRAINING------")
confusion_plot(y_train,yp_train)
print("------TEST------")
confusion_plot(y_test,yp_test)

plot_tree(model,X,Y)
------TRAINING------
ACCURACY: 1.0
CLASS 0 RECALL: 1.0
CLASS 0 PRECISION: 1.0
CLASS 1 RECALL: 1.0
CLASS 1 PRECISION: 1.0
CLASS 2 RECALL: 1.0
CLASS 2 PRECISION: 1.0
[[383   0   0]
 [  0 377   0]
 [  0   0 384]]
<Figure size 576x576 with 0 Axes>
------TEST------
ACCURACY: 0.9930313588850174
CLASS 0 RECALL: 0.9893617021276596
CLASS 0 PRECISION: 0.9893617021276596
CLASS 1 RECALL: 0.99
CLASS 1 PRECISION: 1.0
CLASS 2 RECALL: 1.0
CLASS 2 PRECISION: 0.9893617021276596
[[93  0  1]
 [ 1 99  0]
 [ 0  0 93]]
<Figure size 576x576 with 0 Axes>
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=200, min_samples_split=4, min_samples_leaf=3, max_features='log2', max_depth=70)

# Train the model
rf_classifier.fit(x_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")
Accuracy: 0.9930313588850174
Confusion Matrix:
[[93  0  1]
 [ 1 99  0]
 [ 0  0 93]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        94
           1       1.00      0.99      0.99       100
           2       0.99      1.00      0.99        93

    accuracy                           0.99       287
   macro avg       0.99      0.99      0.99       287
weighted avg       0.99      0.99      0.99       287

In [ ]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Plot feature importances
feature_importances = rf_classifier.feature_importances_
feature_names = X.columns

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=feature_names, palette='viridis')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
<ipython-input-143-6fd300f0b582>:18: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=feature_importances, y=feature_names, palette='viridis')
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the parameter distributions
param_dist = {
    'n_estimators': np.arange(50, 501, 50),
    'max_depth': [None] + list(np.arange(10, 101, 10)),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the base model
rf_base = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_base, param_distributions=param_dist,
                                   n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)

# Fit the model to the data
random_search.fit(x_train, y_train)

# Print the best parameters
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_rf_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Visualize the effect of two hyperparameters
param1 = 'n_estimators'
param2 = 'max_depth'

# Extract the results from the RandomizedSearchCV
results = random_search.cv_results_
param1_values = results['param_' + param1].data.astype(float)
param2_values = results['param_' + param2].data.astype(float)
mean_test_scores = results['mean_test_score']

# Create a scatter plot
plt.figure(figsize=(12, 8))
sc = plt.scatter(param1_values, param2_values, c=mean_test_scores, cmap='viridis', marker='o', edgecolors='k')
plt.colorbar(sc, label='Mean Test Score')
plt.title(f'RandomizedSearchCV Results: {param1} vs {param2}')
plt.xlabel(param1)
plt.ylabel(param2)
plt.show()
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:425: FitFailedWarning: 
57 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
43 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:979: UserWarning: One or more of the test scores are non-finite: [0.99475524        nan 0.99650273        nan 0.99388035 0.99562784
 0.99562555 0.99388035 0.99475524 0.99388035 0.99388035        nan
        nan 0.99650273 0.99388035        nan 0.98775611 0.99475295
 0.98775611 0.98950589 0.98775611        nan        nan 0.99388035
 0.99388035 0.99562555 0.99125567 0.99388035 0.99475295 0.99737533
 0.99388035        nan        nan        nan 0.99650273 0.99213056
 0.99475524 0.99562555 0.98775611 0.99562555 0.99475524 0.98775611
 0.99650044 0.99388035 0.99388035 0.99038078 0.99388035 0.99562555
 0.99388035 0.99388035 0.99388035 0.98775611 0.99388035 0.99650273
 0.99475524 0.99388035        nan 0.99388035        nan 0.99562784
        nan        nan        nan 0.99562784 0.99388035 0.99388035
 0.99388035 0.99650273 0.99562784 0.99475524 0.99388035 0.99388035
 0.99475524 0.99562784        nan        nan 0.99650273 0.99650273
 0.99388035 0.99388035 0.99475295 0.99388035 0.99563013 0.99388035
 0.98950589        nan 0.99213056        nan 0.99562784 0.99475295
 0.99038078 0.99388035 0.98775611 0.99562555 0.99562784 0.99650273
 0.99388035 0.99212827 0.99562555 0.99388035]
  warnings.warn(
Best Parameters: {'n_estimators': 200, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth': 70, 'bootstrap': False}
Test Accuracy: 0.9930313588850174
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
    'n_estimators': np.arange(50, 501, 50),
    'max_depth': [None] + list(np.arange(10, 101, 10)),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier,
    param_distributions=param_grid,
    n_iter=10,  # You can adjust the number of iterations
    scoring='accuracy',
    cv=5,  # You can adjust the number of cross-validation folds
    random_state=42,
    n_jobs=-1  # Use -1 to use all available CPU cores
)

# Fit the RandomizedSearchCV
random_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Initialize the base model with the best hyperparameters
best_rf_model = RandomForestClassifier(**best_params, random_state=42)

# Lists to store results
train_accuracies = []
test_accuracies = []

# Plotting n_estimators over accuracy
plt.figure(figsize=(10, 6))
for n_estimators in param_grid['n_estimators']:
    best_rf_model.n_estimators = n_estimators
    best_rf_model.fit(x_train, y_train)
    
    train_pred = best_rf_model.predict(x_train)
    test_pred = best_rf_model.predict(x_test)
    
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))

# Plot points with dots and connect with lines
plt.plot(param_grid['n_estimators'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['n_estimators'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of n_estimators on Accuracy')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Similar code can be used for other hyperparameters (max_depth, min_samples_split, min_samples_leaf, max_features)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:425: FitFailedWarning: 
15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
/Users/chendong/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:979: UserWarning: One or more of the test scores are non-finite: [0.99562936        nan 0.99562936        nan 0.99388263 0.99562936
 0.99562936        nan 0.99562936 0.99388263]
  warnings.warn(
In [ ]:
# Lists to store results
train_accuracies = []
test_accuracies = []

# Plotting max_depth over accuracy
plt.figure(figsize=(10, 6))
for max_depth in param_grid['max_depth']:
    best_rf_model.max_depth = max_depth
    best_rf_model.fit(x_train, y_train)
    
    train_pred = best_rf_model.predict(x_train)
    test_pred = best_rf_model.predict(x_test)
    
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))

# Plot points with dots and connect with lines
plt.plot(param_grid['max_depth'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['max_depth'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of max_depth on Accuracy')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
In [ ]:
# Lists to store results
train_accuracies = []
test_accuracies = []

# Plotting min_samples_split over accuracy
plt.figure(figsize=(10, 6))
for min_samples_split in param_grid['min_samples_split']:
    best_rf_model.min_samples_split = min_samples_split
    best_rf_model.fit(x_train, y_train)
    
    train_pred = best_rf_model.predict(x_train)
    test_pred = best_rf_model.predict(x_test)
    
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))

# Plot points with dots and connect with lines
plt.plot(param_grid['min_samples_split'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['min_samples_split'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of min_samples_split on Accuracy')
plt.xlabel('min_samples_split')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
In [ ]:
# Lists to store results
train_accuracies = []
test_accuracies = []

# Plotting min_samples_leaf over accuracy
plt.figure(figsize=(10, 6))
for min_samples_leaf in param_grid['min_samples_leaf']:
    best_rf_model.min_samples_leaf = min_samples_leaf
    best_rf_model.fit(x_train, y_train)
    
    train_pred = best_rf_model.predict(x_train)
    test_pred = best_rf_model.predict(x_test)
    
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))

# Plot points with dots and connect with lines
plt.plot(param_grid['min_samples_leaf'], train_accuracies, 'o-', label='Train Accuracy')
plt.plot(param_grid['min_samples_leaf'], test_accuracies, 'o-', label='Test Accuracy')
plt.title('Effect of min_samples_leaf on Accuracy')
plt.xlabel('min_samples_leaf')
plt.ylabel('Accuracy')
plt.legend()
plt.show()